#packages and set working directory
library(vegan)
library(ggplot2)
library(mctoolsr)
library(tidyr)
library(dplyr)
library(gridExtra)
library(kableExtra)
library(decontam)
set.seed(33333)
work_dir <- "/Users/alal3493/Documents/Projects/DevelopmentalBorealToad/FungalDevData/02_PublishedAnalysis/"
seq_dir <- "/Users/alal3493/Documents/Projects/DevelopmentalBorealToad/FungalDevData/00_SeqProcessing/"
Since USEARCH seq processing filtered out some OTU’s (chimeras, mito, chloro, singletons) and samples (too low reads or quality), I want to remove these from my mapping file so it matches the post-seq-processing OTU table. I’ve commented this whole chunk out because I don’t want it to overwrite the files every single time I run or knit this whole notebook.
# map <- read.delim(paste0(seq_dir,"DevelopmentalBorealToad_metadata.txt"), header = T, sep = "\t") # read in original mapping file
# rownames(map) <- map$X.SampleID # define the rownames
# head(map)
# otutab <- read.delim(paste0(seq_dir,"otutab_wTax_noChloroMitoSingl.txt"), header = T, sep = "\t") # read in OTU table post-seq-processing
# head(otutab)
# samplestouse <- colnames(otutab) # columns in OTU table are what we want to filter by
# newmap <- filter(map, rownames(map) %in% samplestouse) # filter the map rows by the above cols
# head(newmap)
# dim(otutab)
# dim(newmap) # check that both dimensions are the same between OTU table and new mapping file
# what are the samples we dropped from the original mapping file; want to check if it's heavily weighted to one sample type
# setdiff(samplestouse, rownames(map))
# the things that don't match are just the OTU ID and taxonomy
# write.table(newmap,
# paste0(work_dir,"01_cleaning/DevelopmentalBorealToad_metadata_USEARCH.txt"),
# sep="\t", quote = F, row.names = F)
# then changed first header column to be "#SampleID" as mctoolsr expects in text editor
There are a lot of sample types that won’t be useful going forward.
Some of these were filtered out manually in excel. For example, dead tadpoles or tadpole mouths that were sampled but not part of the hypotheses we have for this paper. Gloves and sterile water were sampled as controls, but had very few OTU’s and were actually not adequately replicated to be proper controls (also, the glove was sampled after being used to hold a toad, so this is not a good representation of whether the glove was contaminating toad skin or not).
mapfp <- paste0(work_dir,"01_cleaning/DevelopmentalBorealToad_metadata_USEARCH_V2.txt")
tabfp <- paste0(seq_dir,"otutab_wTax_noChloroMitoSingl.txt")
# OTU table used 97% ID OTU picking with UNITE 7.2, filtered for
# chimeras, chloroplasts, mitochondria, OTU sequences that only appear twice or less
input <- load_taxa_table(tab_fp = tabfp, map_fp = mapfp) # this is an mctoolsr package command
# 372 samples total
# each part of the "input" list:
# input$data_loaded -> this is the OTU table
# input$map_loaded) -> this is the mapping file
# input$taxonomy_loaded -> this is the taxonomy of the OTU id's
# filter out two samples were mislabeled during sample processing
# and do not actually have data associated with them
input_f <- filter_data(input = input, filter_cat = "Unique_id_marker",
filter_vals = c("BTP_226", "BTP_47")) # 370 samples remain
Rarefaction looks at how well we sampled each biom or set of samples. Rarefying then normalizes the number of sequences per sample by subsampling to a predefined number. This normalization step helps deal with sequencing biases. I picked 1030 sequences because some research has shown that anything under 1000 is no longer able to capture the community, and I did not want to throw out too many samples. However, at 1030 sequences, there are rare taxa that are probably not going to be captured during the subsampling. As a result, can only talk about the broad, whole community patterns and patterns to do with dominant groups of fungi, but likely not any rare taxa shifts.
# what should I rarefy to?
# what about the blanks and controls I had?
blanks_only <- filter_data(input_f, filter_cat = "Type", keep_vals = c("blank", "control"))
blanks_sorted <- sort(colSums(blanks_only$data_loaded))
hist(blanks_sorted)
tail(blanks_sorted)
## NTC4 Blank_ITS6_1 Blank_ITS6_35 NTC11 Blank_ITS7_27
## 7 7 8 8 10
## NTC7
## 2465
# only NTC7 is high, rest are low
# make file with only OTU and their total counts across all samples
OTU_counts <- filter_data(input_f, filter_cat = "Type", filter_vals = c("blank", "control"))
OTU_sorted <- sort(colSums(OTU_counts$data_loaded))
hist(OTU_sorted, breaks = seq(from=0,to=140000,by=1000))
abline(v=1030, col="red")
# rarefy to 1030 seqs
input_frar <- single_rarefy(input = input_f, depth = 1030)
# 173 samples remain
# order by life stages
order_list <- c("Water",
"Sediment", "Soil",
"Eggs.11.15", "Tadpole.20.22",
"Tadpole.23.25", "Tadpole.25.27",
"Tadpole.27.29", "Tadpole.29.31",
"Tadpole.31.35", "Tadpole.36.39",
"Metamorph.40.46", "Subadult", "Adult", "NA",
"NoTemplateControl", "Primer_blank")
input_frar$map_loaded$Gosner_Stage <-
factor(input_frar$map_loaded$Gosner_Stage,
levels=order_list)
# graph rarefaction curves
# filter out anything less than 1030 since these won't be in the final analysis
# with life stage simplified so it fits on the graph
levels(input_frar$map_loaded$Life_Stage_Simplified) # need 9 colors
## [1] "Adult" "Eggs" "Metamorph"
## [4] "NoTemplateControl" "Sediment" "Soil"
## [7] "Subadult" "Tadpole" "Water"
colors_list <- c("darkred", "blue", "green", "black", "orange", "purple", "darkgreen", "hotpink", "grey")
par(xpd = T, mar = par()$mar + c(0,0,0,10))
rarecurve(sapply(input_frar$data_loaded,
function(x) sapply(split(x,input_frar$map_loaded$Life_Stage_Simplified), sum)),
step=1,
xlab="Number of sequences sampled",
ylab="Number of OTUs",
label=F,
col = colors_list)
legend(70000, 200, legend = levels(input_frar$map_loaded$Life_Stage_Simplified),
col=colors_list, lty = 1, lwd = 3)
# save via the export button in RStudio, changed dpi in Preview on Mac
# return par margins to original settings
par(mar=c(5, 4, 4, 2) + 0.1)
Using Decontam package in R, I identified the contaminants in my data. I did this on the non-rarefied data, then filtered the contaminants out of the rarefied data.
mapfp <- paste0(work_dir,"01_cleaning/DevelopmentalBorealToad_metadata_contam.txt")
tabfp <- paste0(seq_dir,"otutab_wTax_noChloroMitoSingl.txt")
input <- load_taxa_table(tab_fp = tabfp, map_fp = mapfp) # 224 samples
df <- as.data.frame(input$map_loaded) # Put sample_data into a ggplot-friendly data frame
df$LibrarySize <- colSums(input$data_loaded) # new row with the library size
df <- df[order(df$LibrarySize),] # order by library size column
df$Index <- seq(nrow(df)) # make index column to keep the order from above
ggplot(data=df, aes(x=Index, y=LibrarySize, color=Type)) + geom_point() +
geom_hline(yintercept = 1030, col="red") # line is at what I rarefied to
# "life" is toad samples, "env" is environment samples
# frequency method (uses relative abundance)
otu_tab <- t(data.matrix(input$data_loaded))
contamdf.freq <- isContaminant(otu_tab, method="frequency",
conc=input$map_loaded$quant_reading)
head(contamdf.freq)
## freq prev p.freq p.prev p contaminant
## OTU_3 0.0238758531 106 0.6914333 NA 0.6914333 FALSE
## OTU_3754 0.0004390043 5 0.5317508 NA 0.5317508 FALSE
## OTU_135 0.0017510932 11 0.5913609 NA 0.5913609 FALSE
## OTU_36 0.0036782399 24 0.6592738 NA 0.6592738 FALSE
## OTU_1037 0.0001655286 2 0.1426257 NA 0.1426257 FALSE
## OTU_1 0.1148591402 158 0.5582977 NA 0.5582977 FALSE
table(contamdf.freq$contaminant) # true = number of contaminants; 23 here
##
## FALSE TRUE
## 3130 23
head(which(contamdf.freq$contaminant)) # the row index of the contaminants (so the highest is the 51st most abundant thing)
## [1] 51 169 286 540 580 734
otu_names <- row.names(contamdf.freq)
i <- which(contamdf.freq$contaminant)
taxnames <- row.names(contamdf.freq[i,]) # these are the OTU names (rownames) of the contaminants
plot_frequency(otu_tab, taxnames, conc=input$map_loaded$quant_reading, facet=T) +
xlab("DNA Concentration (PicoGreen fluorescent intensity)")
# the only two I miiiight not trust of these is OTU_21 and OTU_54 because of the huge variation across the red dotted line (best fit line to determine if contaminant)
input$taxonomy_loaded["OTU_21",] #this one is a plant-associated microbe, so might not actually be contaminant
## taxonomy1 taxonomy2 taxonomy3 taxonomy4
## OTU_21 k__Fungi p__Ascomycota c__Dothideomycetes o__Pleosporales
## taxonomy5 taxonomy6 taxonomy7
## OTU_21 f__Pleosporaceae g__Alternaria <NA>
input$taxonomy_loaded["OTU_54",] #this one is an indoor environment microbe, so makes sense
## taxonomy1 taxonomy2 taxonomy3 taxonomy4
## OTU_54 k__Fungi p__Ascomycota c__Eurotiomycetes o__Eurotiales
## taxonomy5 taxonomy6 taxonomy7
## OTU_54 f__Aspergillaceae g__Penicillium s__Penicillium_chrysogenum
# try prevalence method (presence/absence)
# make new column with logical for control or not control
input$map_loaded$is.neg <- input$map_loaded$Type == "control"
contamdf.prev <- isContaminant(otu_tab, method="prevalence", neg=input$map_loaded$is.neg)
table(contamdf.prev$contaminant) # less things got pulled out
##
## FALSE TRUE
## 3143 10
y <- which(contamdf.prev$contaminant)
taxnames2 <- row.names(contamdf.prev[y,])
# did this frequency plot even though it's prevalence because it's more intuitive to check distribution
plot_frequency(otu_tab, taxnames2, conc=input$map_loaded$quant_reading, facet=T) +
xlab("DNA Concentration (PicoGreen fluorescent intensity)")
# compare the two methods
intersect(taxnames, taxnames2)
## [1] "OTU_21"
# going to keep OTU_21 a contaminant since it's pulled out by both methods
setdiff(taxnames, taxnames2)
## [1] "OTU_54" "OTU_240" "OTU_562" "OTU_783" "OTU_3502" "OTU_1463"
## [7] "OTU_750" "OTU_2814" "OTU_1640" "OTU_2154" "OTU_6749" "OTU_4923"
## [13] "OTU_3852" "OTU_3347" "OTU_1797" "OTU_3496" "OTU_5644" "OTU_6004"
## [19] "OTU_6743" "OTU_3612" "OTU_6615" "OTU_6811"
setdiff(taxnames2, taxnames)
## [1] "OTU_225" "OTU_260" "OTU_6623" "OTU_429" "OTU_5960" "OTU_1637" "OTU_2806"
## [8] "OTU_4676" "OTU_4497"
#Combine the lists
full_contam_taxlist <- unique(c(taxnames, taxnames2))
# filter out of rarefied input
input_frar_clean <- filter_taxa_from_input(input_frar,
taxa_IDs_to_remove = full_contam_taxlist)
# 25 taxa removed (some were already removed in past filtering steps)
# make a table of all the taxonomic names of these isolates
input_contams <- filter_taxa_from_input(input_frar,
taxa_IDs_to_keep = full_contam_taxlist)
taxons <- input_contams$taxonomy_loaded
contams_table <- data.frame(OTU_id=rownames(taxons),
Taxonomy=paste0(taxons$taxonomy1,"; ",taxons$taxonomy2,"; ",taxons$taxonomy3,"; ",taxons$taxonomy4,"; ",taxons$taxonomy5,"; ",taxons$taxonomy6,"; ",taxons$taxonomy7,"; "))
kable(contams_table) %>%
kable_styling(bootstrap_options = c("striped","bordered"),
full_width = F, position = "left")
| OTU_id | Taxonomy |
|---|---|
| OTU_21 | k__Fungi; p__Ascomycota; c__Dothideomycetes; o__Pleosporales; f__Pleosporaceae; g__Alternaria; NA; |
| OTU_225 | k__Fungi; p__Basidiomycota; c__Tremellomycetes; o__Filobasidiales; NA; NA; NA; |
| OTU_260 | k__Fungi; p__Basidiomycota; c__Agaricomycetes; o__Polyporales; f__Coriolaceae; g__Trametes; s__Trametes_hirsuta; |
| OTU_54 | k__Fungi; p__Ascomycota; c__Eurotiomycetes; o__Eurotiales; f__Aspergillaceae; g__Penicillium; s__Penicillium_chrysogenum; |
| OTU_240 | k__Fungi; p__Ascomycota; c__Dothideomycetes; o__Capnodiales; f__Cladosporiaceae; g__Cladosporium; s__Cladosporium_delicatulum; |
| OTU_6623 | k__Fungi; p__Ascomycota; c__Eurotiomycetes; o__Eurotiales; f__Aspergillaceae; g__Penicillium; s__Penicillium_bialowiezense; |
| OTU_429 | k__Fungi; p__Ascomycota; c__Dothideomycetes; o__Pleosporales; NA; NA; NA; |
| OTU_5960 | k__Fungi; p__Ascomycota; c__Dothideomycetes; o__Pleosporales; NA; NA; NA; |
| OTU_562 | k__Fungi; p__Basidiomycota; c__Agaricomycetes; o__Polyporales; f__Meruliaceae; g__Scopuloides; s__unidentified; |
| OTU_783 | k__Fungi; p__Ascomycota; c__Leotiomycetes; o__Helotiales; f__Hyaloscyphaceae; g__Lachnum; s__Lachnum_pulverulentum; |
| OTU_3502 | k__Fungi; p__Ascomycota; c__Sordariomycetes; o__Hypocreales; f__Nectriaceae; g__Volutella; s__unidentified; |
| OTU_1463 | k__Fungi; p__Basidiomycota; c__Agaricomycetes; o__Agaricales; f__Cortinariaceae; g__Gymnopilus; NA; |
| OTU_750 | k__Fungi; p__Basidiomycota; c__Agaricomycetes; o__Auriculariales; f__Auriculariaceae; g__Auricularia; s__unidentified; |
| OTU_2814 | k__Fungi; p__Ascomycota; NA; NA; NA; NA; NA; |
| OTU_1637 | k__Fungi; p__Ascomycota; c__Sordariomycetes; o__Diaporthales; f__Schizoparmaceae; g__Coniella; NA; |
| OTU_2806 | k__Fungi; p__Basidiomycota; c__Microbotryomycetes; o__Leucosporidiales; f__Leucosporidiaceae; g__Leucosporidium; s__Leucosporidium_intermedium; |
| OTU_1640 | k__Fungi; p__Ascomycota; c__Sordariomycetes; o__Xylariales; f__Apiosporaceae; g__Arthrinium; s__Arthrinium_aureum; |
| OTU_2154 | k__Fungi; p__Ascomycota; c__Leotiomycetes; o__Helotiales; f__Helotiaceae; g__Claussenomyces; s__unidentified; |
| OTU_4923 | k__Fungi; p__Ascomycota; c__Saccharomycetes; o__Saccharomycetales; f__Phaffomycetaceae; g__Wickerhamomyces; s__Wickerhamomyces_anomalus; |
| OTU_3852 | k__Fungi; p__Ascomycota; c__Sordariomycetes; o__Sordariales; f__Lasiosphaeriaceae; g__Schizothecium; s__Schizothecium_glutinans; |
| OTU_1797 | k__Fungi; p__Ascomycota; c__Leotiomycetes; o__Helotiales; f__Helotiaceae; g__Collophora; s__Collophora_hispanica; |
| OTU_3496 | k__Fungi; p__Ascomycota; c__Saccharomycetes; o__Saccharomycetales; f__Saccharomycetaceae; g__Kazachstania; s__Kazachstania_unispora; |
| OTU_5644 | k__Fungi; p__Ascomycota; c__Saccharomycetes; o__Saccharomycetales; f__Saccharomycetaceae; g__Kazachstania; s__Kazachstania_humilis; |
| OTU_6004 | k__Fungi; NA; NA; NA; NA; NA; NA; |
# write.table(contams_table,
# paste0(work_dir,"Figs_Tables/contam_names.txt"),
# sep = "\t", quote = F, row.names=F)
Now I need to filter my data into two useful subsets that I’ll be using in future analsyses: one with only the toad samples and one with both toad and environmental samples
# filter so only life stage data
input_frar_lifeonly <- filter_data(input = input_frar_clean,
filter_cat = "Gosner_Stage",
filter_vals = c("Water", "Sediment", "Soil",
"Primer_blank" ,"NoTemplateControl", "NA"))
# 124 samples remaining
# order by life stage
life_list <- c("Eggs.11.15", "Tadpole.20.22",
"Tadpole.23.25", "Tadpole.25.27",
"Tadpole.27.29", "Tadpole.29.31",
"Tadpole.31.35", "Tadpole.36.39",
"Metamorph.40.46", "Subadult", "Adult")
input_frar_lifeonly$map_loaded$Gosner_Stage <-
factor(input_frar_lifeonly$map_loaded$Gosner_Stage,
levels=life_list)
# save this file
# export_taxa_table(input_frar_lifeonly, paste0(work_dir,"Toad_clean_otutab.txt"),
# paste0(work_dir,"Toad_clean_maptab.txt"))
# filter so only life stages and env data
input_frar_lifeenv <- filter_data(input = input_frar_clean,
filter_cat = "Gosner_Stage",
filter_vals = c("Primer_blank" ,"NoTemplateControl", "NA"))
# 172 samples remaining
# order by env then life stages
lifeenv_list2 <- c("Soil", "Sediment", "Water", "Eggs",
"Tadpole", "Metamorph", "Subadult", "Adult")
input_frar_lifeenv$map_loaded$Life_Stage_Simplified <-
factor(input_frar_lifeenv$map_loaded$Life_Stage_Simplified,
levels=lifeenv_list2)
# export_taxa_table(input_frar_lifeenv, paste0(work_dir,"Allsamps_clean_otutab.txt"),
# paste0(work_dir,"Allsamps_clean_maptab.txt"))
# number of sequences total after rarefaction
sum(colSums(input_frar_lifeenv$data_loaded)) # 171724
## [1] 171670
# number of OTU's in this file
nrow(input_frar_lifeenv$taxonomy_loaded) # 2243
## [1] 2240
# sample sizes table
samp_sizes <- input_frar_lifeenv$map_loaded %>%
group_by(Life_Stage_Simplified) %>%
summarise(Counts=n()) %>% # find counts for the group_by variable, colname is "Counts"
rename(`Sample Type` = Life_Stage_Simplified) %>% # rename Life_Stage_Simplified col
slice(match(lifeenv_list2, `Sample Type`)) %>% # reorder Sample Type col by custom list
bind_rows(summarise_all(., funs(if(is.numeric(.)) sum(.) else "Total")))
kable(samp_sizes) %>%
kable_styling(bootstrap_options = c("striped","bordered"),
full_width = F, position = "left")
| Sample Type | Counts |
|---|---|
| Soil | 3 |
| Sediment | 25 |
| Water | 20 |
| Eggs | 12 |
| Tadpole | 66 |
| Metamorph | 25 |
| Subadult | 5 |
| Adult | 16 |
| Total | 172 |
# write.table(samp_sizes,
# paste0(work_dir,"Figs_Tables/samp_sizes.txt"),
# sep = "\t", quote = F, row.names=F)
This dataset has high resolution between tadpole groups. Preliminary analyses show that some of these may or may not be entirely similar to each other in their fungal communities. Here, I evaluated whether Pairwise permanova was significant with Bonferroni (p <= 0.091) and FDR (p <= 0.05) correction methods; in general, both correction methods were fairly similar in their results and tadpoles could be lumped.
# Calculate dissimilarities between life stages
dm_rel <- convert_to_relative_abundances(input_frar_lifeenv) # relativize species matrix first
dm_lifeenv <- calc_dm(dm_rel$data_loaded, method = "bray") # calc bray-curtis distance between communities, with square root transformation
# Do pairwise PERMANOVA
PW_PERM_lifeenv <- calc_pairwise_permanovas(dm_lifeenv,
metadata_map = input_frar_lifeenv$map_loaded,
compare_header = "Gosner_Stage")
PW_PERM_lumpq <- PW_PERM_lifeenv %>%
arrange(pvalBon, pvalFDR) %>%
filter(pvalBon <= 0.09100) %>%
filter(pvalFDR <= 0.05)
# write.table(PW_PERM_lumpq, paste0(work_dir,"Figs_Tables/PW_PERM.txt"), sep="\t")
# plot dendrogram to visualize mean dissimilarities between sample types
dm_mean <- calc_mean_dissimilarities(dm_lifeenv,
metadata_map = input_frar_lifeenv$map_loaded,
summarize_by_factor = "Gosner_Stage",
return_map = T)
dendro_x <- mctoolsr::plot_dendrogram(dm_mean$dm_loaded,
dm_mean$map_loaded,
labels="Gosner_Stage",
color_by = "Life_Stage_Simplified",
method = "average") +
labs(color="Sample Type")
dendro_x
# result: seems like tadpoles can either all be lumped together or into EarlyandMid/Late clumps
# make paneled figure with table and graph
grid.arrange(tableGrob(PW_PERM_lumpq), dendro_x, ncol=2)
# saved through export button in Rstudio